In [ ]:
import pandas as pd
import numpy as np
sentiment_data = pd.read_csv("naver_ratings.txt", sep='\t')
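A quick sanity check on the loaded corpus (a minimal sketch; the `document` and `label` columns are the ones used below, anything else is incidental):
In [ ]:
# Shape and missing values in the review text column
print(sentiment_data.shape)
print(sentiment_data['document'].isnull().sum())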
In [ ]:
# Naver rating labels: label == 1 (positive), 0 (negative)
sentiment_data.head()
In [ ]:
# Randomly sample `num` reviews with the given label
def select_reviews(data, num, label):
    label_data = data[data.label == label].reset_index(drop=True)
    index = np.random.randint(len(label_data), size=num)  # random row positions (with replacement)
    return label_data.iloc[index]  # .ix was removed from pandas; .iloc is the positional equivalent
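Note that `np.random.randint` draws with replacement, so a review can appear twice in a sample. If duplicates should be avoided, pandas' `DataFrame.sample` is an alternative (a sketch, not the approach used below; the `random_state` value is arbitrary):
In [ ]:
# Sampling without replacement; random_state makes the draw reproducible
def select_reviews_unique(data, num, label):
    return data[data.label == label].sample(n=num, random_state=0)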
In [ ]:
# Randomly sample positive and negative reviews
positive_data = select_reviews(sentiment_data,100,1)
negative_data = select_reviews(sentiment_data,100,0)
In [ ]:
# head of sample positive data
positive_data.head()
In [ ]:
# head of sample negative data
negative_data.head()
In [ ]:
# Collect the review documents into a plain list
def doc_merge(data):
    merged_docs = []
    for doc in data["document"]:
        merged_docs.append(doc)
    return merged_docs
In [ ]:
positive_merged_doc = doc_merge(positive_data)
negative_merged_doc = doc_merge(negative_data)
In [ ]:
positive_merged_doc[0]
In [ ]:
from konlpy.tag import Kkma
kkma = Kkma()
# Keep only modifiers: Kkma POS tags starting with 'M' (adverbs and determiners)
positive_pos = []
for doc in positive_merged_doc:
    for pos in kkma.pos(doc):
        if pos[1][0] == 'M':
            positive_pos.append(pos[0])
negative_pos = []
for doc in negative_merged_doc:
    for pos in kkma.pos(doc):
        if pos[1][0] == 'M':
            negative_pos.append(pos[0])
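To see what the tagger produces, the raw (token, tag) pairs of one review and the first extracted tokens can be inspected (tags beginning with 'M' are modifiers in the Kkma tagset):
In [ ]:
# Raw POS pairs for the first positive review, then the first extracted modifiers
print(kkma.pos(positive_merged_doc[0]))
print(positive_pos[:10])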
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from wordcloud import WordCloud
wordcloud = WordCloud(font_path=r'C:\Windows\Fonts\Daum_SemiBold.ttf')
vectorizer = CountVectorizer(min_df=1)
pos_bow = vectorizer.fit_transform(positive_pos)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(pos_bow)  # computed for reference; the cloud below uses raw counts
# Total count per vocabulary word across all tokens (row [1] was a single one-hot row);
# generate_from_frequencies expects a dict. get_feature_names_out() is get_feature_names() in older scikit-learn.
word_tf = dict(zip(vectorizer.get_feature_names_out(), pos_bow.toarray().sum(axis=0)))
In [ ]:
wordcloud.generate_from_frequencies(word_tf).to_image()
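An equivalent way to build the frequency dictionary without vectorizing is `collections.Counter` (a minimal sketch producing the same cloud from raw counts):
In [ ]:
from collections import Counter
# Counter is a dict subclass, so it can be passed to generate_from_frequencies directly
pos_freq = Counter(positive_pos)
wordcloud.generate_from_frequencies(pos_freq).to_image()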
In [ ]:
vectorizer = CountVectorizer(min_df=1)
neg_bow = vectorizer.fit_transform(negative_pos)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(neg_bow)
# Same fix as above: total counts per word, packed into a dict
word_tf = dict(zip(vectorizer.get_feature_names_out(), neg_bow.toarray().sum(axis=0)))
In [ ]:
wordcloud.generate_from_frequencies(word_tf).to_image()
In [ ]:
# Optionally sample larger sets:
# positive_data2 = select_reviews(sentiment_data, 1000, 1)
# negative_data2 = select_reviews(sentiment_data, 1000, 0)
# Inspect the extracted positive modifiers
positive_pos
In [ ]:
# Merge the positive and negative samples
total_data = pd.concat([positive_data, negative_data], ignore_index=True)
In [ ]:
print(len(positive_data), len(negative_data), len(total_data))
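Since both classes were sampled in equal numbers, the merged set should be balanced; `value_counts` confirms it:
In [ ]:
# Class balance of the merged data
total_data['label'].value_counts()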
In [ ]:
X = total_data['document']
y = total_data['label']
In [ ]:
# Build the BoW matrix and split into train/test sets
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed
vectorizer = CountVectorizer(min_df=1)
bow = vectorizer.fit_transform(X)
X_trn, X_tst, y_trn, y_tst = train_test_split(bow, y, test_size=0.3)
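For a reproducible split that preserves the class ratio, `train_test_split` also accepts `stratify` and `random_state` (a sketch with arbitrary values, kept under separate names so the split above is unchanged):
In [ ]:
# Stratified, seeded variant of the split
X_trn2, X_tst2, y_trn2, y_tst2 = train_test_split(
    bow, y, test_size=0.3, stratify=y, random_state=42)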
In [ ]:
from sklearn import linear_model
model = linear_model.LogisticRegression(penalty='l2')
model.fit(X_trn, y_trn)
In [ ]:
y_pred = model.predict(X_tst)
In [ ]:
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_tst,y_pred)
In [ ]:
# Accuracy = (TN + TP) / total
acc = (conf_mat[0, 0] + conf_mat[1, 1]) / conf_mat.sum()
print("accuracy = %f" % acc)
In [ ]:
# Two unseen reviews: the first positive ("definitely watch it, highly recommend..."), the second negative ("why did I even watch this...")
test_sentiment = ['꼭 보세요 강추합니다 한번 더 보고 싶은 영화에요', '내가 이걸 왜 봤는지 모르겠다. 사전에 검색좀 해보고 볼걸 아.. 짜증나']
In [ ]:
# Build a new BoW matrix using the training vocabulary
vectorizer2 = CountVectorizer(min_df=1,vocabulary = vectorizer.vocabulary_)
In [ ]:
new_input = vectorizer2.transform(test_sentiment)  # vocabulary is fixed, so transform suffices
print(new_input.shape)
In [ ]:
model.predict_proba(new_input)  # LogisticRegression accepts sparse input directly
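`predict_proba` returns one column per class in the order given by `model.classes_`; pairing them makes the output readable:
In [ ]:
# Map each test sentence to its class probabilities (classes_ gives the column order)
for sent, proba in zip(test_sentiment, model.predict_proba(new_input)):
    print(sent, dict(zip(model.classes_, proba.round(3))))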